# -*- coding: utf-8 -*-
'''
Created on 2017-05-23

@author: ZhuJiahui506
'''

import time
import os
import thulac
import re
from global_info.global_nlp import GlobalNLP
from nlp_utils.thulac_word_segment import get_stopwords
from file_utils.file_reader import read_to_1d_list
from file_utils.file_writer import quick_write_1d_to_text


def word_segment(train_texts, train_tags, write_filename1, write_filename2):
    """Segment Chinese texts with THULAC, drop stopwords, and persist results.

    Texts whose raw segmentation yields fewer than 10 tokens are discarded,
    together with their corresponding tags, so only reasonably long samples
    are written out.

    :param train_texts: list of raw text strings
    :param train_tags: list of tags, parallel to ``train_texts``
    :param write_filename1: output path for the space-joined segmented texts
    :param write_filename2: output path for the tags of the kept texts
    """
    train_seg_texts = []
    train_filtered_tags = []

    # Build the stopword set once so membership tests in the loop are O(1)
    # instead of O(n) list scans per token.
    stopwords = set(get_stopwords())
    # Chinese word segmentation (traditional->simplified, noise filtering,
    # segmentation only).
    thu = thulac.thulac(T2S=True, filt=True, seg_only=True,
                        deli=GlobalNLP.CN_WORD_INNER_DELIMITER)
    for text, tag in zip(train_texts, train_tags):
        # Replace separator-like punctuation with spaces before segmenting.
        cleaned = re.sub(r'[\t_~=/\+\-\*\|\\]', " ", text)
        segment = thu.cut(cleaned)
        # Keep tokens that are not stopwords (item[0] is the word string).
        kept_words = [item[0] for item in segment if item[0] not in stopwords]

        # NOTE(review): the length threshold is applied to the raw
        # segmentation, not the stopword-filtered list — presumably
        # intentional (filter on original text length); confirm.
        if len(segment) >= 10:
            train_seg_texts.append(" ".join(kept_words))
            train_filtered_tags.append(tag)

    quick_write_1d_to_text(write_filename1, train_seg_texts)
    quick_write_1d_to_text(write_filename2, train_filtered_tags)


def test_word_segment():
    """Read the raw news texts and tags, run word_segment, and time the run.

    Expects ``dataset/all_news.txt`` and ``dataset/all_news_tags.txt`` under
    the parent of the current working directory; writes the segmented texts
    and filtered tags next to them.
    """
    # time.clock() was deprecated since 3.3 and removed in Python 3.8;
    # perf_counter() is the recommended wall-clock timer for intervals.
    start = time.perf_counter()
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'
    read_filename1 = root_directory + 'dataset/all_news.txt'
    read_filename2 = root_directory + 'dataset/all_news_tags.txt'
    write_filename1 = root_directory + 'dataset/train_segment_texts.txt'
    write_filename2 = root_directory + 'dataset/train_segment_tags.txt'

    train_texts = read_to_1d_list(read_filename1)
    train_tags = read_to_1d_list(read_filename2)

    word_segment(train_texts, train_tags, write_filename1, write_filename2)
    print('Total time %f seconds' % (time.perf_counter() - start))
    
# Script entry point: run the segmentation pipeline when executed directly.
if __name__ == '__main__':
    test_word_segment()
    